{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "【Task 5】 模型调优（2天）  \n",
    "任务5：使用网格搜索法对5个模型进行调优（调参时采用五折交叉验证的方式），并进行模型评估，记得展示代码的运行结果。   \n",
    "时间：2天  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from pandas import DataFrame,Series\n",
    "\n",
    "df = pd.read_csv('data.csv',encoding = 'gbk')\n",
    "\n",
    "\"\"\"\n",
    "数据处理\n",
    "\"\"\"\n",
    "\n",
    "###删除无关特征###\n",
    "df.drop(['Unnamed: 0','custid','trade_no','bank_card_no' ,'source','id_name'],inplace =True,axis = 1)\n",
    "\n",
    "###数据类型转换###（主要针对 obeject（文字类） \n",
    "df['reg_preference_for_trad'].fillna('其他城市',inplace = True)\n",
    "df['reg_preference_for_trad'].replace({'一线城市':1,'二线城市':2,'三线城市':3,'境外':4,'其他城市':5},inplace = True)\n",
    "\n",
    "# 处理日期格式 'latest_query_time', 'loans_latest_time'(暂时去掉？？？)\n",
    "\n",
    "df.drop(['latest_query_time', 'loans_latest_time'],inplace =True,axis = 1)\n",
    "\n",
    "\n",
    "###缺失值处理###\n",
    "df['student_feature'].fillna(0,inplace=True) \n",
    "for i in df.columns:\n",
    "    df[i].fillna(df[i].mode()[0],inplace = True)  #加[0]是因为众数可能有多个，返回不是一个数字"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 评估方法一：切分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "###切分数据集###\n",
    "\n",
    "y=df['status']\n",
    "x=df.drop('status',axis=1)\n",
    "x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.3,random_state = 2018)   \n",
    "\n",
    "features = x_train.columns #目前是83个特征\n",
    "scaler = StandardScaler()\n",
    "x_train = scaler.fit_transform(x_train)\n",
    "x_test = scaler.fit_transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0    0.78588   0.92790   0.85101      1068\n",
      "          1    0.53614   0.24791   0.33905       359\n",
      "\n",
      "avg / total    0.72306   0.75683   0.72221      1427\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#随机森林\n",
    "clf_rf = RandomForestClassifier()\n",
    "clf_rf.fit(x_train, y_train)\n",
    "rf_y_pred = clf_rf.predict(x_test)\n",
    "\n",
    "#评估 \n",
    "ans = classification_report(y_test,rf_y_pred,digits=5)\n",
    "print(ans)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 评估方法二：交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.77521008  0.78338591  0.77707676  0.76526316  0.77473684] 0.775134550981\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0    0.79206   0.94664   0.86248      3561\n",
      "          1    0.61847   0.25817   0.36428      1193\n",
      "\n",
      "avg / total    0.74850   0.77387   0.73746      4754\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#  使用交叉验证:直接就可以验证模型，这里用全部数据集而不是训练集\n",
    "from sklearn.model_selection import cross_val_predict,cross_val_score\n",
    "\n",
    "# #先做标准化  这里做不做都一样\n",
    "scaler = StandardScaler()\n",
    "x = scaler.fit_transform(x)\n",
    "x= scaler.fit_transform(x)\n",
    "\n",
    "model_rf = RandomForestClassifier()\n",
    "score_rf = cross_val_score(model_rf, x,y,cv = 5)\n",
    "pred_rf = cross_val_predict(model_rf, x,y,cv = 5)\n",
    "print(score_rf,score_rf.mean())\n",
    "ans1 = classification_report(y,pred_rf,digits=5)\n",
    "print(ans1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型微调"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 方法一：网格搜索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid=[{'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]}, {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "param_grid = [\n",
    "{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
    "{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
    "]\n",
    "rf_model = RandomForestClassifier()\n",
    "grid_search = GridSearchCV(rf_model, param_grid, cv=5)\n",
    "grid_search.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最高得分：0.78776\n",
      "最优参数：n_estimators:30 max_features:8 bootstrap:True\n"
     ]
    }
   ],
   "source": [
    "# 搜索结果\n",
    "print('最高得分：%.5f'% grid_search.best_score_)\n",
    "print('最优参数：n_estimators:{} max_features:{} bootstrap:{}'.format(grid_search.best_estimator_.n_estimators,\\\n",
    "                                                        grid_search.best_estimator_.max_features,grid_search.best_estimator_.bootstrap))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
